Read the data again and check if it was successfully pulled. Perform data composition and clean up necessary NA values for simple answers.
Code
# View NA valuena_summary <-data.frame(Variable =c("SLEDAI", "C3", "C4"),NA_Count =c(sum(is.na(pheno_clean$sledai_at_baseline)),sum(is.na(pheno_clean$c3)),sum(is.na(pheno_clean$c4)) ))datatable(na_summary,caption ="NA Count Summary for Clinical Variables",options =list(pageLength =5, autoWidth =TRUE))
Code
#Replace NA value with median to ensure dimensional consistency.median_sledai <-median(pheno_clean$sledai_at_baseline, na.rm =TRUE)median_c3 <-median(pheno_clean$c3, na.rm =TRUE)median_c4 <-median(pheno_clean$c4, na.rm =TRUE)pheno_clean <- pheno_clean %>%mutate(sledai_at_baseline =ifelse(is.na(sledai_at_baseline), median_sledai, sledai_at_baseline),c3 =ifelse(is.na(c3), median_c3, c3),c4 =ifelse(is.na(c4), median_c4, c4) )summary_df <-as.data.frame(summary(pheno_clean))datatable(summary_df,caption ="Summary Statistics After NA Imputation",options =list(pageLength =10, autoWidth =TRUE))
4.2 Difference Analysis
4.2.1Exploration of risk grouping
According to the conclusions of Anian’s data cleaning and the references in the paper. Risk grouping is performed for the three important indicators of SLE activity index, C3, and C4.
Code
#Create risk classification based on different clinical indicatorspheno_clean$risk_group_sledai <-ifelse( pheno_clean$sledai_at_baseline >=10, "HighRisk_SLEDAI", "LowRisk_SLEDAI")pheno_clean$risk_group_c3 <-ifelse( pheno_clean$c3 <0.9, "HighRisk_C3", "LowRisk_C3")pheno_clean$risk_group_c4 <-ifelse( pheno_clean$c4 <0.1, "HighRisk_C4", "LowRisk_C4")#Calculate the sample size for each risk groupdf_sledai =as_tibble(table(pheno_clean$risk_group_sledai), .name_repair ="minimal")df_c3 =as_tibble(table(pheno_clean$risk_group_c3), .name_repair ="minimal")df_c4 =as_tibble(table(pheno_clean$risk_group_c4), .name_repair ="minimal")colnames(df_sledai) =c("Group", "Count_SLEDAI")colnames(df_c3) =c("Group", "Count_C3")colnames(df_c4) =c("Group", "Count_C4")df_combined_sledai_c3 =full_join(df_sledai, df_c3, by ="Group")df_combined =full_join(df_combined_sledai_c3, df_c4, by ="Group")kable(df_combined, caption ="Risk Group Statistics Based on SLEDAI, C3, and C4 Levels")
Risk Group Statistics Based on SLEDAI, C3, and C4 Levels
Group
Count_SLEDAI
Count_C3
Count_C4
HighRisk_SLEDAI
1073
NA
NA
LowRisk_SLEDAI
747
NA
NA
HighRisk_C3
NA
650
NA
LowRisk_C3
NA
1170
NA
HighRisk_C4
NA
NA
384
LowRisk_C4
NA
NA
1436
4.2.2 Differential analysis of SLEDAI risk grouping
The selection of threshold is based on data cleaning and the support of geo sourced literature. 2.And select significant genes based on the significance level of p<0.05.
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
4.3 Merge common significant genes
Merge and display the common significant genes of the three risk groups, so that the selected common significant genes under the three high reference indicators will definitely have stronger predictive performance.
Code
#Screen genes with p-values less than 0.05 in each analysis groupsignificant_sledai <- results_sledai[results_sledai$adj.P.Val <0.05,]significant_c3 <- results_c3[results_c3$adj.P.Val <0.05,]significant_c4 <- results_c4[results_c4$adj.P.Val <0.05,]#Find the intersection using gene namescommon_genes <-Reduce(intersect, list(rownames(significant_sledai), rownames(significant_c3), rownames(significant_c4)))#Can create a data box containing these common genetic informationcommon_genes_data <-data.frame(Gene = common_genes,PVal_SLEDAI = significant_sledai[common_genes, "adj.P.Val", drop =FALSE],PVal_C3 = significant_c3[common_genes, "adj.P.Val", drop =FALSE],PVal_C4 = significant_c4[common_genes, "adj.P.Val", drop =FALSE])datatable(common_genes_data, options =list(pageLength =10, autoWidth =TRUE),caption ="Common Significant Genes Across SLEDAI, C3, and C4 Analyses")
To avoid excessive rendering complexity in presenting files, only genes with good performance in common differences were displayed. Avoiding the occurrence of lagging situations.
---title: "Analysis(DE,Plot)"author: "ruinan"format: html: toc: true toc-location: left toc-depth: 3 embed-resources: true code-fold: true code-tools: trueeditor: source ---```{r message=FALSE, warning=FALSE}library(dplyr)library(knitr)library(limma)library(DT)library(readr)library(knitr)```# 4. Data Analysis## 4.1 Simple EDA#### Read the data again and check if it was successfully pulled. Perform data composition and clean up necessary NA values for simple answers.```{r echo=FALSE, message=FALSE, warning=FALSE}# 重新读取数据expr_df_matched <- readRDS("E:/data3888/cl/expr_df_matched.rds")pheno_clean <- readRDS("E:/data3888/cl/pheno_clean.rds")# 显示前几行数据(结构查看建议去 Console 或手动检查)library(DT)# 用交互式表格查看数据前几行datatable(head(expr_df_matched), caption = "Preview of expr_df_matched", options = list(pageLength = 5, autoWidth = TRUE))datatable(head(pheno_clean), caption = "Preview of pheno_clean", options = list(pageLength = 5, autoWidth = TRUE))``````{r}# View NA valuena_summary <-data.frame(Variable =c("SLEDAI", "C3", "C4"),NA_Count =c(sum(is.na(pheno_clean$sledai_at_baseline)),sum(is.na(pheno_clean$c3)),sum(is.na(pheno_clean$c4)) ))datatable(na_summary,caption ="NA Count Summary for Clinical Variables",options =list(pageLength =5, autoWidth =TRUE))``````{r}#Replace NA value with median to ensure dimensional consistency.median_sledai <-median(pheno_clean$sledai_at_baseline, na.rm =TRUE)median_c3 <-median(pheno_clean$c3, na.rm =TRUE)median_c4 <-median(pheno_clean$c4, na.rm =TRUE)pheno_clean <- pheno_clean %>%mutate(sledai_at_baseline =ifelse(is.na(sledai_at_baseline), median_sledai, sledai_at_baseline),c3 =ifelse(is.na(c3), median_c3, c3),c4 =ifelse(is.na(c4), median_c4, c4) )summary_df <-as.data.frame(summary(pheno_clean))datatable(summary_df,caption ="Summary Statistics After NA Imputation",options =list(pageLength =10, autoWidth =TRUE))```## 4.2 Difference Analysis### 4.2.1Exploration of risk grouping#### According to the conclusions of Anian's data cleaning and the references in the paper. Risk grouping is performed for the three important indicators of SLE activity index, C3, and C4.```{r}#Create risk classification based on different clinical indicatorspheno_clean$risk_group_sledai <-ifelse( pheno_clean$sledai_at_baseline >=10, "HighRisk_SLEDAI", "LowRisk_SLEDAI")pheno_clean$risk_group_c3 <-ifelse( pheno_clean$c3 <0.9, "HighRisk_C3", "LowRisk_C3")pheno_clean$risk_group_c4 <-ifelse( pheno_clean$c4 <0.1, "HighRisk_C4", "LowRisk_C4")#Calculate the sample size for each risk groupdf_sledai =as_tibble(table(pheno_clean$risk_group_sledai), .name_repair ="minimal")df_c3 =as_tibble(table(pheno_clean$risk_group_c3), .name_repair ="minimal")df_c4 =as_tibble(table(pheno_clean$risk_group_c4), .name_repair ="minimal")colnames(df_sledai) =c("Group", "Count_SLEDAI")colnames(df_c3) =c("Group", "Count_C3")colnames(df_c4) =c("Group", "Count_C4")df_combined_sledai_c3 =full_join(df_sledai, df_c3, by ="Group")df_combined =full_join(df_combined_sledai_c3, df_c4, by ="Group")kable(df_combined, caption ="Risk Group Statistics Based on SLEDAI, C3, and C4 Levels")```### 4.2.2 Differential analysis of SLEDAI risk grouping#### The selection of threshold is based on data cleaning and the support of geo sourced literature. 2.And select significant genes based on the significance level of p<0.05.```{r}threshold_sledai <-10pheno_clean$risk_group_SLEDAI <-ifelse(pheno_clean$sledai_at_baseline >= threshold_sledai, "HighRisk_SLEDAI", "LowRisk_SLEDAI")design_sledai <-model.matrix(~ pheno_clean$risk_group_SLEDAI)#Fit limma modelfit_sledai <-lmFit(expr_df_matched, design_sledai)#Applying eBayes methodfit_sledai <-eBayes(fit_sledai)#Extract differentially expressed genesresults_sledai <-topTable(fit_sledai, coef=2, n=Inf) results_sledai_filtered <- results_sledai[results_sledai$adj.P.Val <0.05, ] #Create an interactive table using the DT packagedatatable(round(results_sledai_filtered, 2), options =list(pageLength =10, autoWidth =TRUE),caption ="Significant Differentially Expressed Genes in SLEDAI (p < 0.05)")```### 4.2.3 Differential analysis of C3 risk grouping#### Process, threshold setting, and significance level setting are consistent with 4.2.1```{r message=FALSE, warning=FALSE}threshold_c3 <- 0.9pheno_clean$risk_group_C3 <- ifelse(pheno_clean$c3 < threshold_c3, "HighRisk_C3", "LowRisk_C3")design_c3 <- model.matrix(~ pheno_clean$risk_group_C3)fit_c3 <- lmFit(expr_df_matched, design_c3)fit_c3 <- eBayes(fit_c3)results_c3 <- topTable(fit_c3, coef=2, n=Inf) results_c3_filtered <- results_c3[results_c3$adj.P.Val < 0.05, ] datatable(round(results_c3_filtered, 2), options = list(pageLength = 10, autoWidth = TRUE), caption = "Significant Differentially Expressed Genes in C3 Analysis (p < 0.05)")```### 4.2.4 Differential analysis of C4 risk grouping#### Process, threshold setting, and significance level setting are consistent with 4.2.1```{r}# 假设C4得分阈值是0.1threshold_c4 <-0.1# 创建风险分组列pheno_clean$risk_group_C4 <-ifelse(pheno_clean$c4 < threshold_c4, "HighRisk_C4", "LowRisk_C4")# 为 C4 风险分组设置设计矩阵design_c4 <-model.matrix(~ pheno_clean$risk_group_C4)# 拟合 limma 模型fit_c4 <-lmFit(expr_df_matched, design_c4)# 应用 eBayes 方法fit_c4 <-eBayes(fit_c4)# 提取差异表达的基因results_c4 <-topTable(fit_c4, coef=2, n=Inf) # 提取所有基因results_c4_filtered <- results_c4[results_c4$adj.P.Val <0.05, ] # 筛选 p 值小于 0.05 的基因# 使用 DT 包来创建一个交互式表格datatable(round(results_c4_filtered, 2), options =list(pageLength =10, autoWidth =TRUE),caption ="Significant Differentially Expressed Genes in C4 Analysis (p < 0.05)")```## 4.3 Merge common significant genes#### Merge and display the common significant genes of the three risk groups, so that the selected common significant genes under the three high reference indicators will definitely have stronger predictive performance.```{r}#Screen genes with p-values less than 0.05 in each analysis groupsignificant_sledai <- results_sledai[results_sledai$adj.P.Val <0.05,]significant_c3 <- results_c3[results_c3$adj.P.Val <0.05,]significant_c4 <- results_c4[results_c4$adj.P.Val <0.05,]#Find the intersection using gene namescommon_genes <-Reduce(intersect, list(rownames(significant_sledai), rownames(significant_c3), rownames(significant_c4)))#Can create a data box containing these common genetic informationcommon_genes_data <-data.frame(Gene = common_genes,PVal_SLEDAI = significant_sledai[common_genes, "adj.P.Val", drop =FALSE],PVal_C3 = significant_c3[common_genes, "adj.P.Val", drop =FALSE],PVal_C4 = significant_c4[common_genes, "adj.P.Val", drop =FALSE])datatable(common_genes_data, options =list(pageLength =10, autoWidth =TRUE),caption ="Common Significant Genes Across SLEDAI, C3, and C4 Analyses")``````{r}# 保存筛选后的基因数据到 CSV 文件write.csv(common_genes_data, "Common_Significant_Genes.csv", row.names =FALSE)```# 5.Interactive chart display## 5.1. Interactive volcano map presentation#### To avoid excessive rendering complexity in presenting files, only genes with good performance in common differences were displayed. Avoiding the occurrence of lagging situations.```{r}results_sledai$Gene <-rownames(results_sledai)results_sledai$Time <-"SLEDAI"results_c3$Gene <-rownames(results_c3)results_c3$Time <-"C3"results_c4$Gene <-rownames(results_c4)results_c4$Time <-"C4"volcano_all <-rbind(results_sledai, results_c3, results_c4)volcano_all$IsCommon <-ifelse(volcano_all$Gene %in% common_genes, "Common", "NotCommon")volcano_common <-subset(volcano_all, IsCommon =="Common")library(ggplot2)library(plotly)p_common <-ggplot(volcano_common, aes(x = logFC, y =-log10(P.Value))) +geom_point(aes(color = Time, text = Gene), size =1.5, alpha =0.8) +facet_wrap(~ Time) +labs(x ="logFC", y ="-log10(p value)", title ="Common Significant Genes") +theme_minimal()p_interactive <-ggplotly(p_common, tooltip ="text")p_interactive```## 5.2. MA interaction diagram presentation#### Presented in the form of MA charts, the data filtering is consistent with volcano charts.```{r}p_ma <-ggplot(volcano_common, aes(x = AveExpr, y = logFC)) +geom_point(aes(color = Time, text = Gene), size =1.5, alpha =0.8) +facet_wrap(~ Time) +labs(x ="Average Expression (log2)", y ="log2 Fold Change", title ="MA Plot - Common DEGs") +theme_minimal()p_ma_interactive <-ggplotly(p_ma, tooltip ="text")p_ma_interactive```